library(tidyverse)
library(dplyr)
library(maps)
library(spBayes)
library(viridis)

########
# Datasets description
# PA timestamp is recorded in GMT
# EPA timestamp is recorded in both GMT and EST
# However, the time difference between GMT and EST is always the same (5 hr)
# Which means that EPA timestamp may not account for daylight saving time
# The latest EPA update is 08/31/2020

# NOTE(review): setwd() hard-codes one machine's directory layout; every file
# path below is relative to this directory, so the script only runs as-is on
# that machine.
setwd("/Users/hongjianyang/Research/PA/")
# Project helpers used below (at least dfColNorm() and locOpe() -- see the
# grid-construction and covariate code further down).
source('Code/Tools/myFunc.R')

###########################################################################
###########################################################################
###########################################################################
# 2020 Data
# Explore variance between 07/01/2020 - 07/07/2020
# 29 epa stations, 89 purple air stations (07/01)
###########################################################################
###########################################################################
###########################################################################
# Load raw PurpleAir (PA) and EPA/FRM hourly PM2.5 observations for NC.
pa2020 <- read.csv('Data/NC_PA_FRM_PM/PA_2020_hourly_PM_NC.csv')
epa2020 <- read.csv('Data/NC_PA_FRM_PM/FRM_2020_hourly_PM_NC.csv')

# Subset PA data: keep location/time/corrected PM2.5, and average sensors
# that share the same location and hour.
col <- c('Lon', 'Lat', 'Timestamp', 'PM25_corrected')
pa <- pa2020 %>%
  # all_of(): select by an external character vector explicitly (bare `col`
  # is ambiguous and deprecated in tidyselect).
  select(all_of(col)) %>%
  rename(PM25 = PM25_corrected) %>%
  group_by(Lon, Lat, Timestamp) %>%
  # .groups = 'drop': return an ungrouped result instead of leaving the
  # table grouped by Lon/Lat for all downstream operations.
  summarise(PM25 = mean(PM25), .groups = 'drop')

pa <- pa[order(pa$Lon, pa$Lat, pa$Timestamp), ]
# Remove rows with any NA (location, time, or PM2.5)
pa <- pa[complete.cases(pa), ]

# Process EPA data: build a "<date> <hour>:00" timestamp string from the
# separate GMT date and hour columns.
epa2020$Timestamp <- paste0(epa2020$Date_GMT, ' ', epa2020$Hour_GMT, ':00')
col <- c('Lon', 'Lat', 'Timestamp', 'PM25')
epa <- epa2020 %>%
  select(all_of(col))
epa <- epa[order(epa$Lon, epa$Lat, epa$Timestamp), ]

# Restrict both datasets to 07/01/2020 - 07/07/2020.
# Per the header notes, both PA and the EPA Date_GMT/Hour_GMT timestamps are
# in GMT, so parse them explicitly with tz = 'GMT'.  Without it, as.POSIXct()
# uses the machine's local time zone and the one-week window is shifted by
# the local UTC offset.  All comparisons below use values parsed here, so
# the fix is self-consistent.
pa$Timestamp <- as.POSIXct(pa$Timestamp, tz = 'GMT')
epa$Timestamp <- as.POSIXct(epa$Timestamp, format = '%Y-%m-%d %H:%M:%OS',
                            tz = 'GMT')
startTime <- as.POSIXct('2020-07-01 00:00:00', tz = 'GMT')
endTime <- as.POSIXct('2020-07-07 23:59:59', tz = 'GMT')
pa <- subset(pa, Timestamp <= endTime & Timestamp >= startTime)
epa <- subset(epa, Timestamp <= endTime & Timestamp >= startTime)

# Window length in seconds, from the first to the last whole hour
# (used below to derive the number of hourly timestamps).
start <- as.numeric(startTime)
end <- as.numeric(as.POSIXct('2020-07-07 23:00:00', tz = 'GMT'))
diff <- end - start

# Network indicator: 1 = EPA/FRM monitor, 0 = PurpleAir sensor.
epa$indicator <- 1
pa$indicator <- 0

# Stack both networks into one table for joint modeling.
df_combine <- rbind(epa, pa)

# Construct NC prediction locations: a 0.1-degree lon/lat grid, keeping only
# the points that fall inside North Carolina.
s01   <- seq(-90, -65, 0.1) # Lon
s02   <- seq(30, 38, 0.1)   # Lat
s0    <- as.matrix(expand.grid(s01, s02))
# map.where() returns NA for grid points outside the state polygon.
innc <- map.where(map('state', 'north carolina', fill = TRUE), s0[, 1], s0[, 2])

s0    <- s0[!is.na(innc), ]

# X0 is the prediction covariate matrix: column-normalized lon/lat expanded
# by locOpe() (sourced helper), plus a trailing 0 column for the EPA network
# indicator, i.e. predictions are on the PurpleAir (indicator = 0) level.
interim <- dfColNorm(s0)
X0 <- locOpe(interim[, 1], interim[, 2])
X0 <- cbind(X0, 0)

# One summary value per hourly timestamp in the window.
# numeric() preallocates a zero-filled double vector (vector() would give
# logicals that get coerced on first assignment).
n_timestamp <- diff / 3600 + 1
one_var <- numeric(n_timestamp)

# MCMC settings shared by every per-hour spLM fit below.
n.samples <- 25000
burn      <- 5000

# Model: Y = \beta_{0} + \beta_{1}*Lon + \beta_{2}*Lat + \beta_{3}*Lon^2 + 
# \beta_{4}*Lat^2 + \beta_{5}*LonLat + \beta_{6}I(epa)
#
# For each hour in the study window: fit a Bayesian spatial regression
# (spBayes::spLM) to the pooled EPA + PurpleAir observations at that hour,
# krige onto the NC prediction grid, plot the posterior predictive SD map,
# and record the mean posterior predictive SD over the RTP area.
for (i in 0:(n_timestamp - 1)) {

  # Timestamp for this iteration: startTime + i hours (absolute time, so
  # the comparison below is time-zone independent).
  current <- as.POSIXct(3600 * i, origin = startTime)

  
  # Observations from both networks recorded at this hour.
  subdf <- subset(df_combine, Timestamp == current)
  S <- data.matrix(subdf[, 1:2])  # observation coordinates (Lon, Lat)
  maxd      <- max(dist(S))       # max pairwise distance; scales the phi prior
  
  # Construct covarites
  # Standardize lon/lat, then expand into the quadratic-surface terms via
  # locOpe(); the appended column is the EPA network indicator (beta_6).
  # NOTE(review): the prediction covariates X0 were normalized with
  # dfColNorm() instead -- presumably the same transform; confirm in
  # Code/Tools/myFunc.R.
  interim <- S
  interim[, 1] <- (interim[, 1] - mean(interim[, 1])) / sd(interim[, 1])
  interim[, 2] <- (interim[, 2] - mean(interim[, 2])) / sd(interim[, 2])
  X <- locOpe(interim[, 1], interim[, 2])
  X <- cbind(X, subdf$indicator)
  
  
  
  # Kriging
  Y <- subdf$PM25
  
  # Starting values / tuning / priors:
  # - phi.Unif bounds the spatial decay so the range parameter lies between
  #   1% and 200% of the max observed distance.
  # - Total variance is split evenly between spatial (sigma.sq) and nugget
  #   (tau.sq) at the start.
  # - amcmc: adaptive MCMC in batches of 100, targeting 0.43 acceptance.
  starting  <- list("phi"=1/(0.05*maxd), "sigma.sq"=0.5*var(Y), "tau.sq"=0.5*var(Y))
  tuning    <- list("phi"=1, "sigma.sq"=0.05*var(Y), "tau.sq"=0.05*var(Y))
  amcmc     <- list("n.batch"=n.samples/100, "batch.length"=100, "accept.rate"=0.43)
  priors    <- list("beta.Norm"=list(rep(0,7), 100*diag(7)),
                    "phi.Unif"=c(1/(2*maxd), 1/(0.01*maxd)),
                    "sigma.sq.IG"=c(2, 1),
                    "tau.sq.IG"=c(2, 1))
  
  # Y~X-1: the model intercept is suppressed because X is assumed to already
  # carry an intercept column.  NOTE(review): this requires locOpe() to
  # return 6 columns so X has 7 total, matching the 7-dim beta prior --
  # confirm against myFunc.R.
  fit_spbayes  <- spLM(Y~X-1, coords=S,
                       starting=starting, tuning=tuning, 
                       priors=priors, cov.model="exponential",
                       amcmc=amcmc, n.samples=n.samples,verbose=FALSE)
  
  # Posterior predictive draws at the grid locations, discarding the first
  # `burn` samples and thinning by 10.
  pred <- spPredict(fit_spbayes, pred.coords=s0, pred.covars=X0, 
                    start=burn, thin=10, verbose=FALSE)
  
  pred <- pred$p.y.predictive.samples
  
  # Per-location summaries over the retained draws (rows = locations).
  Yhat  <- apply(pred,1,mean)
  Ysd   <- apply(pred,1,sd)
  
  df <- data.frame(long=s0[,1],lat=s0[,2],Y=Ysd)
  # Samples near RTP area
  # Longitude range: -78.89, -78.52; Latitude range: 35.71, 35.92
  rtp_sample <- subset(df, df$long <=-78.52 & df$long >= -78.89 & df$lat >= 35.71
                       & df$lat <= 35.92)
  rtp_std <- mean(rtp_sample$Y)
  one_var[i + 1] = rtp_std
  # Plot standard deviation
  # spa = number of PurpleAir sensors reporting this hour; shown in the
  # x-axis label together with the timestamp.
  spa <- sum(subdf$indicator == 0)
  pred_var <- ggplot(df, aes(long, lat)) +
    geom_polygon(aes(x=long, y=lat)) +
    geom_raster(aes(fill = Y)) +
    geom_point(data = subdf, aes(Lon, Lat), colour = 'purple', size = 0.5) +
    scale_fill_gradientn(colours = viridis(10))+
    xlab(paste(current, 'pa:' ,spa))+ylab("")+labs(title="Posterior predictive standard deviation")+
    coord_fixed()
  plot(pred_var)
}

# Plot NC county outlines with the monitor locations from the last hour
# fitted above (subdf is whatever the loop left behind).
# Renamed from `map` to avoid shadowing maps::map(), which this script
# calls when building the prediction grid.
county_map <- map_data("county")
nc <- subset(county_map, region == "north carolina")
R <- ggplot(data = nc) +
  geom_polygon(aes(x = long, y = lat, group = group)) +
  geom_point(data = subdf, aes(Lon, Lat), colour = 'purple', size = 0.5) +
  coord_fixed()
R

# Print the hourly RTP posterior-predictive SD series (one value per hour,
# 7 days x 24 hours).
one_var
##   [1]  1.9048368  1.5832198  1.5200750  1.6563294  1.4398364  1.5934195
##   [7]  1.4973053  1.3427967  1.4712567  1.3217118  1.4989728  5.0032899
##  [13]  0.9792305  1.3363501  3.0152614  1.9947599  1.4174218  1.2175833
##  [19]  2.3087290  1.2261241  1.4834114  3.5019689  1.7703452  1.8094301
##  [25]  1.5860019  1.6026113  1.7426921  1.5841446  1.7294246  1.7342763
##  [31]  1.3831059  1.3724256  1.6106443  1.1407127  1.2730400  1.1259907
##  [37]  1.2192255  1.5158505  1.5773947  1.0800112  0.8631125  1.0146638
##  [43]  1.3478049  1.3111975  1.2398468  1.3112580  1.4580665  1.3840458
##  [49]  1.6054990  3.5155082  2.7428236  3.2419055  2.6611634  3.0540734
##  [55]  2.8503498  2.7266901  2.9118467  2.0380908  1.9859960  1.8466596
##  [61]  2.0427892  1.7610580  1.6490430  1.4626731  0.9646056  1.8512729
##  [67]  1.3290973  2.1195550  1.4925464  1.3499480 14.6531890  9.2243556
##  [73]  1.9231597  3.2265338  3.5711976  4.2417595  4.4170137  5.3389522
##  [79]  4.0870788  3.7464255  3.0644302  2.9690865  2.9395986  2.3930836
##  [85]  2.4108763  2.5963960  2.3664499  2.6909745  2.3122420  1.8210420
##  [91]  2.8446473  1.8336552  2.3167879  2.1936964  2.7104716  3.0800347
##  [97]  2.8019361 16.4045181 25.0712051 19.9969423 16.7350831 16.7473956
## [103] 12.0809767 11.8051846  9.1699250  6.5857246  4.5789852  3.6883366
## [109]  2.9584011  2.5291041  2.4177612  2.5924377  2.1150160  1.8972906
## [115]  2.7303757  2.5666619  1.7726310  1.8439864  4.3175927  2.3918483
## [121]  1.9799893  3.8631553  2.5073974  1.9745292  2.0344041  1.8943319
## [127]  1.8298457  2.0460701  1.7995480  2.3591586  1.7315899  2.4513072
## [133]  1.7956905  1.7805624  2.2439236  1.1747335  1.9654415  1.7894427
## [139]  4.2895419  1.1891301  1.5994178  1.3230480  2.1357277  2.1292956
## [145]  1.5660845  1.8231866  2.1490869  1.5563236  2.1251874  1.5657276
## [151]  1.4879444  1.6608017  1.7666723  1.6862213  2.1901775  2.2921440
## [157]  2.0486098  1.7476744  1.6288742  1.6411623  1.2071630  1.6836439
## [163]  1.3765771  3.4007835  1.2159221  1.2555763  0.7956735  0.8614610
# One trace per day: the 24 hourly values belonging to each of the 7 days.
for (day in seq_len(7)) {
  idx <- seq(from = 24 * (day - 1) + 1, length.out = 24)
  plot(one_var[idx], type = 'b')
}

# Full week-long series in a single trace.
plot(one_var, type = 'b')